package com.huan.es8.aggregations.bucket;

import co.elastic.clients.elasticsearch.core.SearchRequest;
import co.elastic.clients.elasticsearch.core.SearchResponse;
import com.huan.es8.AbstractEs8Api;
import org.junit.jupiter.api.*;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

/**
 * Demonstrates the {@code rare_terms} bucket aggregation.
 * <p>
 * A rare terms aggregation resembles a {@code terms} aggregation ordered by {@code _count asc}.
 * The results of a {@code terms} aggregation sorted by ascending count are not accurate, which is
 * why {@code rare_terms} should be used for this purpose instead.
 * <p>
 * The class uses the per-class test lifecycle so that the non-static {@code @BeforeAll} /
 * {@code @AfterAll} methods can create and drop the index around the tests.
 *
 * @author huan.fu
 * @date 2022/11/13 - 23:12
 * @see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-rare-terms-aggregation.html">Reference documentation</a>
 * @see <a href="https://blog.csdn.net/fu_huo_1993/article/details/127840024">https://blog.csdn.net/fu_huo_1993/article/details/127840024</a>
 */
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class RareTermsAggs extends AbstractEs8Api {

    /** Index that is created before the tests run and deleted afterwards. */
    private static final String INDEX_NAME = "index_person";

    /** Name under which the rare terms aggregation is registered in the search request. */
    private static final String AGG_NAME = "agg_province";

    /**
     * Settings and mappings of the index. {@code province} and {@code sex} are both copied into
     * the {@code province_sex} keyword field.
     */
    private static final String INDEX_DEFINITION = String.join("\n",
            "{",
            "  \"settings\": {",
            "    \"number_of_shards\": 1",
            "  },",
            "  \"mappings\": {",
            "    \"properties\": {",
            "      \"id\": {",
            "        \"type\": \"long\"",
            "      },",
            "      \"name\": {",
            "        \"type\": \"keyword\"",
            "      },",
            "      \"province\": {",
            "        \"type\": \"keyword\",",
            "        \"copy_to\": [\"province_sex\"]",
            "      },",
            "      \"sex\": {",
            "        \"type\": \"keyword\",",
            "        \"copy_to\": [\"province_sex\"]",
            "      },",
            "      \"age\": {",
            "        \"type\": \"integer\"",
            "      },",
            "      \"address\": {",
            "        \"type\": \"text\",",
            "        \"analyzer\": \"ik_max_word\",",
            "        \"fields\": {",
            "          \"keyword\": {",
            "            \"type\": \"keyword\",",
            "            \"ignore_above\": 256",
            "          }",
            "        }",
            "      },",
            "      \"province_sex\":{",
            "        \"type\":\"keyword\"",
            "      }",
            "    }",
            "  }",
            "}");

    /** Seed documents, one JSON source per person. */
    private static final String[] PERSON_DOCUMENTS = {
            "{\"id\":1,\"name\":\"张三\",\"sex\":\"男\",\"age\":20,\"province\":\"湖北\",\"address\":\"湖北省黄冈市罗田县匡河镇\"}",
            "{\"id\":2,\"name\":\"李四\",\"sex\":\"男\",\"age\":19,\"province\":\"江苏\",\"address\":\"江苏省南京市\"}",
            "{\"id\":3,\"name\":\"王武\",\"sex\":\"女\",\"age\":25,\"province\":\"湖北\",\"address\":\"湖北省武汉市江汉区\"}",
            "{\"id\":4,\"name\":\"赵六\",\"sex\":\"女\",\"age\":30,\"province\":\"北京\",\"address\":\"北京市东城区\"}",
            "{\"id\":5,\"name\":\"钱七\",\"sex\":\"女\",\"age\":16,\"province\":\"北京\",\"address\":\"北京市西城区\"}",
            "{\"id\":6,\"name\":\"王八\",\"sex\":\"女\",\"age\":45,\"province\":\"北京\",\"address\":\"北京市朝阳区\"}",
            "{\"id\":7,\"name\":\"九哥\",\"sex\":\"男\",\"age\":25,\"province\":\"上海市\",\"address\":\"上海市嘉定区\"}"
    };

    /**
     * Creates the index and loads the seed documents through the inherited helpers.
     *
     * @throws IOException propagated from the inherited {@code createIndex} / {@code bulk} helpers
     */
    @BeforeAll
    public void createIndex() throws IOException {
        createIndex(INDEX_NAME, INDEX_DEFINITION);
        bulk(INDEX_NAME, Arrays.asList(PERSON_DOCUMENTS));
    }

    /**
     * Runs a rare terms aggregation on {@code province} and prints both the request and the
     * response. The test makes no assertions; it is meant for inspecting the output.
     *
     * @throws IOException propagated from the search call
     */
    @Test
    @DisplayName("稀少的term聚合，类似按照 _count asc 排序的terms聚合，但是terms聚合中按照_count asc的结果是不准的，需要使用 rare terms 聚合")
    public void agg01() throws IOException {
        SearchRequest request = rareProvinceRequest();
        System.out.println(request);

        SearchResponse<Object> result = client.search(request, Object.class);
        System.out.println(result);
    }

    /**
     * Builds the search request: no hits are returned ({@code size = 0}), only the rare terms
     * aggregation over the {@code province} field.
     */
    private SearchRequest rareProvinceRequest() {
        return new SearchRequest.Builder()
                .index(INDEX_NAME)
                .size(0)
                .aggregations(AGG_NAME, container -> container.rareTerms(rareTerms -> rareTerms
                        // Field whose rare terms are collected.
                        .field("province")
                        // Maximum number of documents a term may appear in to still count as rare.
                        // The upper limit is 100; the original author notes that raising it is
                        // supposed to require changing search.max_buckets, but that changing this
                        // setting had no effect when tried.
                        // In this example every term occurring in at most 2 documents is returned.
                        .maxDocCount(2L)
                        // Precision of the internal cuckoo filter: a smaller value is more
                        // accurate but uses more memory; the minimum is 0.00001.
                        // NOTE(review): the Elasticsearch reference lists the default as 0.001 —
                        // confirm for the server version in use.
                        .precision(0.01)
                        // Terms to include; a single value may be a regular expression.
                        .include(included -> included.regexp("(.*上.*|.*湖.*|.*江.*)"))
                        // Terms to exclude; when given as a collection the values must be exact.
                        .exclude(excluded -> excluded.terms(Collections.singletonList("江苏")))
                        // Value used for documents that have no province field.
                        .missing("default省")
                ))
                .build();
    }

    /**
     * Drops the index once all tests have run.
     *
     * @throws IOException propagated from the inherited {@code deleteIndex} helper
     */
    @AfterAll
    public void deleteIndex() throws IOException {
        deleteIndex(INDEX_NAME);
    }
}
